To install the Natural Language Toolkit (NLTK), type the following in a terminal: sudo pip install -U nltk
You'll also need the gensim package: pip install -U gensim
The remaining dependencies used below can be installed the same way:
sudo pip install textblob
sudo pip install fuzzy
In [1]:
import nltk
#nltk.download()   # uncomment to open the interactive NLTK downloader and fetch corpora
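If you prefer to download only the NLTK resources this walkthrough actually uses, rather than everything the interactive downloader offers, a minimal sketch is the following (these are the standard NLTK resource identifiers for the tokenizer, WordNet, the POS tagger and the stopword list):
# download only the resources used later in this notebook
nltk.download('punkt')                       # tokenizer models for word_tokenize
nltk.download('wordnet')                     # WordNet data for WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # model behind pos_tag
nltk.download('stopwords')                   # standard English stopword list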
2) Text Preprocessing
2.1) Noise Removal: stopwords, URLs, punctuation, mentions, etc.
In [4]:
# Sample code to remove noisy words from a text using a custom noise list
noise_list = ["is", "a", "this", "..."]
def _remove_noise(input_text):
    words = input_text.split()
    noise_free_words = [word for word in words if word not in noise_list]
    noise_free_text = " ".join(noise_free_words)
    return noise_free_text
In [5]:
_remove_noise("this is a sample text")
Out[5]:
'sample text'
In [6]:
# Sample code to remove a regex pattern
import re
In [7]:
def _remove_regex(input_text, regex_pattern):
    matches = re.finditer(regex_pattern, input_text)
    for match in matches:
        input_text = re.sub(match.group().strip(), '', input_text)
    return input_text
In [8]:
regex_pattern = r"#[\w]*"
In [9]:
_remove_regex("remove this #hashtag from analytics vidhya", regex_pattern)
Out[9]:
'remove this  from analytics vidhya'
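The custom noise_list above is only illustrative; the same idea extends to a standard stopword list. A minimal sketch using NLTK's built-in English stopwords (assuming the 'stopwords' corpus has been downloaded) could look like this:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def _remove_stopwords(input_text):
    # keep only the words that are not in the standard English stopword list
    words = input_text.split()
    return " ".join(word for word in words if word.lower() not in stop_words)

_remove_stopwords("this is a sample text")   # -> 'sample text'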
2.2) Lexicon Normalization
For example, “play”, “player”, “played”, “plays” and “playing” are different variations of the word “play”. Though they mean different things, contextually they are all similar. This step converts all the variations of a word into their normalized form (also known as the lemma).
Stemming: Stemming is a rudimentary rule-based process of stripping suffixes (“ing”, “ly”, “es”, “s”, etc.) from a word.
Lemmatization: Lemmatization, on the other hand, is an organized, step-by-step procedure for obtaining the root form of a word; it makes use of vocabulary (the dictionary meaning of words) and morphological analysis (word structure and grammatical relations).
In [11]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
In [12]:
lem = WordNetLemmatizer()
stem = PorterStemmer()
In [13]:
word = "multiplying"
In [14]:
lem.lemmatize(word, "v")
Out[14]:
'multiply'
In [15]:
stem.stem(word)
Out[15]:
'multipli'
2.3) Object Standardization
Text data often contains words or phrases which are not present in any standard lexical dictionary. These pieces are not recognized by search engines and models.
Some examples are acronyms, hashtags with attached words, and colloquial slang. With the help of regular expressions and manually prepared data dictionaries, this type of noise can be fixed. The code below uses a dictionary lookup method to replace social media slang in a text.
In [17]:
lookup_dict = {'rt':'Retweet', 'dm':'direct message', "awsm" : "awesome", "luv" :"love"}
In [24]:
def _lookup_words(input_text):
    words = input_text.split()
    new_words = []
    for word in words:
        if word.lower() in lookup_dict:
            word = lookup_dict[word.lower()]
        new_words.append(word)
    new_text = " ".join(new_words)
    return new_text
In [25]:
_lookup_words("RT this is a retweeted tweet by Shivam Bansal")   # -> 'Retweet this is a retweeted tweet by Shivam Bansal'
3) Text to Features (Feature Engineering on Text Data)
In [26]:
from nltk import word_tokenize, pos_tag
In [27]:
text = "I am learning Natural Language Processing on Analytics Vidhya"
In [28]:
tokens = word_tokenize(text)
In [29]:
print(pos_tag(tokens))
In [32]:
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
In [33]:
doc_complete = [doc1, doc2, doc3]
In [34]:
# Tokenize each document on whitespace (no further cleaning for this small example)
doc_clean = [doc.split() for doc in doc_complete]
In [43]:
import gensim
from gensim import corpora
In [44]:
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)
In [45]:
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
In [46]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel
In [47]:
# Running and Training LDA model on the document term matrix
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
In [48]:
# Results
print(ldamodel.print_topics())
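print_topics shows the top words per topic; to see how an individual document distributes over those topics, gensim's LdaModel also provides get_document_topics. A quick check on the first document might look like this:
# topic distribution for the first document, as (topic_id, probability) pairs
print(ldamodel.get_document_topics(doc_term_matrix[0]))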
In [49]:
def generate_ngrams(text, n):
    words = text.split()
    output = []
    for i in range(len(words) - n + 1):
        output.append(words[i:i + n])
    return output
In [50]:
generate_ngrams('this is a sample text', 2)
Out[50]:
[['this', 'is'], ['is', 'a'], ['a', 'sample'], ['sample', 'text']]
In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [52]:
obj = TfidfVectorizer()
In [53]:
corpus = ['This is sample document.', 'another random document.', 'third sample document text']
In [54]:
X = obj.fit_transform(corpus)
In [55]:
print(X)
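The sparse matrix printed above identifies terms only by column index. To map those indices back to the vocabulary the vectorizer learned, recent scikit-learn releases expose get_feature_names_out (older releases use get_feature_names):
# vocabulary terms corresponding to the columns of X
print(obj.get_feature_names_out())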
In [56]:
from gensim.models import Word2Vec
In [57]:
sentences = [['data', 'science'], ['vidhya', 'science', 'data', 'analytics'],['machine', 'learning'], ['deep', 'learning']]
In [58]:
# train the model on your corpus
model = Word2Vec(sentences, min_count = 1)
In [60]:
# similarity between two words from the toy corpus
# (note: gensim >= 4.0 moved these lookups to the wv attribute, e.g. model.wv.similarity('data', 'science'))
print(model.similarity('data', 'science'))
In [61]:
# raw embedding vector for a word (in gensim >= 4.0 use model.wv['learning'])
print(model['learning'])
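Besides pairwise similarity, the trained vectors also support nearest-neighbour queries; in gensim 4.x this lives on the model's wv attribute. A quick sketch on this toy corpus:
# words most similar to 'data' according to the trained (toy) model
print(model.wv.most_similar('data'))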
In [64]:
from textblob.classifiers import NaiveBayesClassifier as NBC
In [65]:
from textblob import TextBlob
In [66]:
training_corpus = [
    ('I am exhausted of this work.', 'Class_B'),
    ("I can't cooperate with this", 'Class_B'),
    ('He is my badest enemy!', 'Class_B'),
    ('My management is poor.', 'Class_B'),
    ('I love this burger.', 'Class_A'),
    ('This is an brilliant place!', 'Class_A'),
    ('I feel very good about these dates.', 'Class_A'),
    ('This is my best work.', 'Class_A'),
    ("What an awesome view", 'Class_A'),
    ('I do not like this dish', 'Class_B')]
In [67]:
test_corpus = [
    ("I am not feeling well today.", 'Class_B'),
    ("I feel brilliant!", 'Class_A'),
    ('Gary is a friend of mine.', 'Class_A'),
    ("I can't believe I'm doing this.", 'Class_B'),
    ('The date was good.', 'Class_A'),
    ('I do not enjoy my job', 'Class_B')]
In [68]:
model = NBC(training_corpus)
In [69]:
print(model.classify("Their codes are amazing."))
In [70]:
print(model.classify("I don't like their computer."))
In [71]:
print(model.accuracy(test_corpus))
In [80]:
# import TfidfVectorizer for feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
In [93]:
# import classification_report for evaluation
from sklearn.metrics import classification_report
In [76]:
from sklearn import svm
In [83]:
# preparing data for SVM model (using the same training_corpus, test_corpus from naive bayes example)
train_data = []
train_labels = []
for row in training_corpus:
    train_data.append(row[0])
    train_labels.append(row[1])
test_data = []
test_labels = []
for row in test_corpus:
    test_data.append(row[0])
    test_labels.append(row[1])
In [84]:
# Create feature vectors
vectorizer = TfidfVectorizer(min_df=4, max_df=0.9)
In [85]:
# Train the feature vectors
train_vectors = vectorizer.fit_transform(train_data)
In [86]:
# Apply model on test data
test_vectors = vectorizer.transform(test_data)
In [87]:
# Perform classification with SVM, kernel=linear
model = svm.SVC(kernel='linear')
In [88]:
model.fit(train_vectors, train_labels)
Out[88]:
In [89]:
prediction = model.predict(test_vectors)
In [94]:
print(classification_report(test_labels, prediction))
In [95]:
def levenshtein(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    distances = range(len(s1) + 1)
    for index2, char2 in enumerate(s2):
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(s1):
            if char1 == char2:
                newDistances.append(distances[index1])
            else:
                newDistances.append(1 + min((distances[index1], distances[index1 + 1], newDistances[-1])))
        distances = newDistances
    return distances[-1]
In [96]:
print(levenshtein("analyze","analyse"))
In [100]:
import fuzzy
In [98]:
soundex = fuzzy.Soundex(4)
In [99]:
print(soundex('ankit'))
In [ ]:
print(soundex('aunkit'))
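The point of the Soundex example is that phonetically similar spellings collapse to the same code. Assuming fuzzy implements the standard four-character Soundex algorithm, the two spellings above should compare equal:
# both spellings map to the same phonetic code, so they are treated as a match
print(soundex('ankit') == soundex('aunkit'))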
In [ ]:
import math
In [ ]:
from collections import Counter
In [ ]:
def get_cosine(vec1, vec2):
    common = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in common])
    sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
    sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator
In [ ]:
def text_to_vector(text):
    words = text.split()
    return Counter(words)
In [101]:
text1 = 'This is an article on analytics vidhya'
text2 = 'article on analytics vidhya is about natural language processing'
In [ ]:
vector1 = text_to_vector(text1)
In [ ]:
vector2 = text_to_vector(text2)
In [ ]:
cosine = get_cosine(vector1, vector2)
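Finally, printing the result shows the cosine similarity between the two texts, a value between 0 and 1 for these non-negative term-count vectors:
print(cosine)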